{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python3.5/dist-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n",
      "  from ._conv import register_converters as _register_converters\n"
     ]
    }
   ],
   "source": [
    "import tensorflow as tf\n",
    "import numpy as np\n",
    "import re"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "word2idx = {'PAD': 0,'NUM':1,'UNK':2}\n",
    "tag2idx = {'PAD': 0}\n",
    "char2idx = {'PAD': 0}\n",
    "word_idx = 3\n",
    "tag_idx = 1\n",
    "char_idx = 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def process_word(word, lower=False):\n",
    "    if lower:\n",
    "        word = word.lower()\n",
    "    if word.isdigit():\n",
    "        word = 'NUM'\n",
    "    else:\n",
    "        word = re.sub('[^A-Za-z0-9\\- ]+', '', word)\n",
    "    return word\n",
    "\n",
    "def read_file(file):\n",
    "    global word_idx, tag_idx, char_idx\n",
    "    with open(file,'r') as f:\n",
    "        words, tags, X, Y = [], [], [], []\n",
    "        for line in f:\n",
    "            line = line.strip()\n",
    "            if (len(line) == 0 or line.startswith(\"-DOCSTART-\")):\n",
    "                continue\n",
    "            else:\n",
    "                ls = line.split(' ')\n",
    "                if len(ls) > 1:\n",
    "                    word, tag = ls[0],ls[-1]\n",
    "                else:\n",
    "                    word = ls[0]\n",
    "                    tag = 'O'\n",
    "                for c in word:\n",
    "                    if c not in char2idx:\n",
    "                        char2idx[c] = char_idx\n",
    "                        char_idx += 1\n",
    "                word = process_word(word,True)\n",
    "                if len(word) < 1:\n",
    "                    continue\n",
    "                words += [word]\n",
    "                tags += [tag]\n",
    "                if word not in word2idx:\n",
    "                    word2idx[word] = word_idx\n",
    "                    word_idx += 1\n",
    "                X.append(word2idx[word])\n",
    "                if tag not in tag2idx:\n",
    "                    tag2idx[tag] = tag_idx\n",
    "                    tag_idx += 1\n",
    "                Y.append(tag2idx[tag])\n",
    "                        \n",
    "    return words, tags, X, Y"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_words, train_tags, train_X, train_Y = read_file('eng.train')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_words, test_tags, test_X, test_Y = read_file('eng.testa')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "idx2tag={idx: tag for tag, idx in tag2idx.items()}\n",
    "idx2word={idx: tag for tag, idx in word2idx.items()}\n",
    "batch_size = 64\n",
    "dim_word = 128\n",
    "dim_char = 32\n",
    "dropout = 0.8\n",
    "learning_rate = 1e-3\n",
    "hidden_size_char = 64\n",
    "hidden_size_word = 128\n",
    "num_layers = 2\n",
    "seq_len = 20\n",
    "display_step = 50\n",
    "epoch = 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "class Model:\n",
    "    def __init__(self, dim_word, dim_char, dropout, learning_rate,\n",
    "                 hidden_size_char, hidden_size_word, num_layers):\n",
    "        \n",
    "        def cells(size, reuse=False):\n",
    "            return tf.nn.rnn_cell.LSTMCell(size,initializer=tf.orthogonal_initializer(),reuse=reuse)\n",
    "        \n",
    "        def clip_grads(loss):\n",
    "            variables = tf.trainable_variables()\n",
    "            grads = tf.gradients(loss, variables)\n",
    "            clipped_grads, _ = tf.clip_by_global_norm(grads, 5.0)\n",
    "            return zip(clipped_grads, variables)\n",
    "        \n",
    "        self.word_ids = tf.placeholder(tf.int32, shape=[None, None])\n",
    "        self.char_ids = tf.placeholder(tf.int32, shape=[None, None, None])\n",
    "        self.labels = tf.placeholder(tf.int32, shape=[None, None])\n",
    "        \n",
    "        self.word_embeddings = tf.Variable(tf.truncated_normal([len(word2idx), dim_word],\n",
    "                                                      stddev=1.0 / np.sqrt(dim_word)))\n",
    "        self.char_embeddings = tf.Variable(tf.truncated_normal([len(char2idx), dim_char],\n",
    "                                                      stddev=1.0 / np.sqrt(dim_char)))\n",
    "        word_embedded = tf.nn.embedding_lookup(self.word_embeddings, self.word_ids)\n",
    "        char_embedded = tf.nn.embedding_lookup(self.char_embeddings, self.char_ids)\n",
    "        s = tf.shape(char_embedded)\n",
    "        char_embedded = tf.reshape(char_embedded, shape=[s[0]*s[1], s[-2], dim_char])\n",
    "        for n in range(num_layers):\n",
    "            (out_fw, out_bw), (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(\n",
    "                cell_fw = cells(hidden_size_char),\n",
    "                cell_bw = cells(hidden_size_char),\n",
    "                inputs = char_embedded,\n",
    "                dtype = tf.float32,\n",
    "                scope = 'bidirectional_rnn_char_%d'%(n))\n",
    "            char_embedded = tf.concat((out_fw, out_bw), 2)\n",
    "        output = tf.reshape(char_embedded[:,-1], shape=[s[0], s[1], 2*hidden_size_char])\n",
    "        word_embedded = tf.concat([word_embedded, output], axis=-1)\n",
    "        word_embedded = tf.nn.dropout(word_embedded, dropout)\n",
    "        \n",
    "        for n in range(num_layers):\n",
    "            (out_fw, out_bw), (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(\n",
    "                cell_fw = cells(hidden_size_word),\n",
    "                cell_bw = cells(hidden_size_word),\n",
    "                inputs = word_embedded,\n",
    "                dtype=tf.float32,\n",
    "                scope = 'bidirectional_rnn_word_%d'%(n))\n",
    "            word_embedded = tf.concat((out_fw, out_bw), 2)\n",
    "        word_embedded = tf.nn.dropout(word_embedded, dropout)\n",
    "        \n",
    "        W = tf.get_variable('w',shape=(2*hidden_size_word, len(idx2tag)),\n",
    "                            initializer=tf.orthogonal_initializer())\n",
    "        b = tf.get_variable('b',shape=(len(idx2tag)),initializer=tf.zeros_initializer())\n",
    "        \n",
    "        nsteps = tf.shape(word_embedded)[1]\n",
    "        output = tf.reshape(word_embedded, [-1, 2*hidden_size_word])\n",
    "        pred = tf.matmul(output, W) + b\n",
    "        self.logits = tf.reshape(pred, [-1, nsteps, len(idx2tag)])\n",
    "        \n",
    "        log_likelihood, trans_params = tf.contrib.crf.crf_log_likelihood(\n",
    "        self.logits, self.labels, tf.count_nonzero(self.word_ids, 1))\n",
    "        \n",
    "        self.cost = tf.reduce_mean(-log_likelihood)\n",
    "        self.global_step = tf.Variable(0, trainable=False)\n",
    "        \n",
    "        self.learning_rate = tf.train.exponential_decay(learning_rate,\n",
    "                                                        self.global_step, len(train_X) // batch_size,\n",
    "                                                        0.1)\n",
    "        \n",
    "        self.crf_decode = tf.contrib.crf.crf_decode(self.logits, \n",
    "                                                    trans_params, \n",
    "                                                    tf.count_nonzero(self.word_ids, 1))[0]\n",
    "        \n",
    "        self.optimizer = tf.train.AdamOptimizer(self.learning_rate).apply_gradients(clip_grads(self.cost), \n",
    "                                                                                    global_step=self.global_step)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/gradients_impl.py:96: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.\n",
      "  \"Converting sparse IndexedSlices to a dense Tensor of unknown shape. \"\n"
     ]
    }
   ],
   "source": [
    "tf.reset_default_graph()\n",
    "sess = tf.InteractiveSession()\n",
    "model = Model(dim_word,dim_char,dropout,learning_rate,hidden_size_char,hidden_size_word,num_layers)\n",
    "sess.run(tf.global_variables_initializer())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "def iter_seq(x):\n",
    "    return np.array([x[i: i+seq_len] for i in range(0, len(x)-seq_len, 1)])\n",
    "\n",
    "def to_train_seq(*args):\n",
    "    return [iter_seq(x) for x in args]\n",
    "\n",
    "def generate_char_seq(batch):\n",
    "    x = [[len(idx2word[i]) for i in k] for k in batch]\n",
    "    maxlen = max([j for i in x for j in i])\n",
    "    temp = np.zeros((batch.shape[0],batch.shape[1],maxlen),dtype=np.int32)\n",
    "    for i in range(batch.shape[0]):\n",
    "        for k in range(batch.shape[1]):\n",
    "            for no, c in enumerate(idx2word[batch[i,k]]):\n",
    "                temp[i,k,-1-no] = char2idx[c]\n",
    "    return temp"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_X_seq, train_Y_seq = to_train_seq(train_X, train_Y)\n",
    "test_X_seq, test_Y_seq = to_train_seq(test_X, test_Y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(array([1, 2, 3, 4, 5, 6, 7, 8]),\n",
       " array([ 198310, 2908994,   90686,  222459,  165511,     220,     740,\n",
       "            480]))"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.unique(train_Y_seq.ravel(),return_counts=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(array([1, 2, 3, 4, 5, 7]),\n",
       " array([ 40886, 731743,  25207,  62733,  41871,     80]))"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.unique(test_Y_seq.ravel(),return_counts=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "epoch 1, step 1, loss 43.618607\n",
      "epoch 1, step 50, loss 12.042439\n",
      "epoch 1, step 100, loss 27.015694\n",
      "epoch 1, step 150, loss 19.762203\n",
      "epoch 1, step 200, loss 8.836092\n",
      "epoch 1, step 250, loss 12.508820\n",
      "epoch 1, step 300, loss 6.588864\n",
      "epoch 1, step 350, loss 7.509544\n",
      "epoch 1, step 400, loss 3.870700\n",
      "epoch 1, step 450, loss 8.295753\n",
      "epoch 1, step 500, loss 12.325174\n",
      "epoch 1, step 550, loss 4.284832\n",
      "epoch 1, step 600, loss 4.885774\n",
      "epoch 1, step 650, loss 2.738923\n",
      "epoch 1, step 700, loss 8.448340\n",
      "epoch 1, step 750, loss 1.698012\n",
      "epoch 1, step 800, loss 7.499028\n",
      "epoch 1, step 850, loss 3.572803\n",
      "epoch 1, step 900, loss 10.391118\n",
      "epoch 1, step 950, loss 1.823482\n",
      "epoch 1, step 1000, loss 1.544621\n",
      "epoch 1, step 1050, loss 4.242450\n",
      "epoch 1, step 1100, loss 3.664677\n",
      "epoch 1, step 1150, loss 7.973126\n",
      "epoch 1, step 1200, loss 5.950489\n",
      "epoch 1, step 1250, loss 6.812481\n",
      "epoch 1, step 1300, loss 2.563293\n",
      "epoch 1, step 1350, loss 1.441920\n",
      "epoch 1, step 1400, loss 4.230231\n",
      "epoch 1, step 1450, loss 4.453869\n",
      "epoch 1, step 1500, loss 5.672961\n",
      "epoch 1, step 1550, loss 11.875997\n",
      "epoch 1, step 1600, loss 3.044885\n",
      "epoch 1, step 1650, loss 2.946229\n",
      "epoch 1, step 1700, loss 3.574232\n",
      "epoch 1, step 1750, loss 2.958346\n",
      "epoch 1, step 1800, loss 1.697220\n",
      "epoch 1, step 1850, loss 4.913888\n",
      "epoch 1, step 1900, loss 3.025366\n",
      "epoch 1, step 1950, loss 6.279280\n",
      "epoch 1, step 2000, loss 8.284986\n",
      "epoch 1, step 2050, loss 4.095070\n",
      "epoch 1, step 2100, loss 4.019399\n",
      "epoch 1, step 2150, loss 1.684503\n",
      "epoch 1, step 2200, loss 0.392990\n",
      "epoch 1, step 2250, loss 5.035411\n",
      "epoch 1, step 2300, loss 2.516311\n",
      "epoch 1, step 2350, loss 4.100960\n",
      "epoch 1, step 2400, loss 18.148668\n",
      "epoch 1, step 2450, loss 4.546373\n",
      "epoch 1, step 2500, loss 3.523759\n",
      "epoch 1, step 2550, loss 6.172002\n",
      "epoch 1, step 2600, loss 10.173368\n",
      "epoch 1, step 2650, loss 2.837925\n",
      "epoch 1, step 2700, loss 9.974754\n",
      "epoch 1, step 2750, loss 9.797827\n",
      "epoch 1, step 2800, loss 5.975362\n",
      "epoch 1, avg loss 7.219527\n"
     ]
    }
   ],
   "source": [
    "for i in range(epoch):\n",
    "    total_cost = 0\n",
    "    for k in range(0,(train_X_seq.shape[0] // batch_size)*batch_size,batch_size):\n",
    "        batch_x = train_X_seq[k:k+batch_size]\n",
    "        batch_y = train_Y_seq[k:k+batch_size]\n",
    "        batch_x_char = generate_char_seq(batch_x)\n",
    "        step, loss, _ = sess.run([model.global_step, model.cost, model.optimizer],\n",
    "                                 feed_dict={model.word_ids:batch_x,\n",
    "                                           model.char_ids:batch_x_char,\n",
    "                                           model.labels:batch_y})\n",
    "        if step % display_step == 0 or step == 1:\n",
    "            print('epoch %d, step %d, loss %f'%(i+1,step,loss))\n",
    "        total_cost += loss\n",
    "    total_cost /= (train_X_seq.shape[0] // batch_size)\n",
    "    print('epoch %d, avg loss %f'%(i+1,total_cost))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "label_Y, predicted_Y = [], []\n",
    "for k in range(0,(test_X_seq.shape[0] // batch_size)*batch_size,batch_size):\n",
    "    batch_x = test_X_seq[k:k+batch_size]\n",
    "    batch_x_char = generate_char_seq(batch_x)\n",
    "    batch_y = test_Y_seq[k:k+batch_size]\n",
    "    Y_pred = sess.run(model.crf_decode,\n",
    "                  feed_dict={model.word_ids:batch_x,\n",
    "                            model.char_ids:batch_x_char})\n",
    "    predicted_Y.append(Y_pred)\n",
    "    label_Y.append(batch_y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "             precision    recall  f1-score   support\n",
      "\n",
      "        PAD       0.46      0.46      0.46     40883\n",
      "      I-PER       0.96      0.99      0.97    731626\n",
      "     B-MISC       0.68      0.25      0.37     25207\n",
      "      I-ORG       0.79      0.64      0.71     62733\n",
      "          O       0.75      0.77      0.76     41871\n",
      "      B-ORG       0.00      0.00      0.00        80\n",
      "\n",
      "avg / total       0.90      0.91      0.90    902400\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python3.5/dist-packages/sklearn/metrics/classification.py:1428: UserWarning: labels size, 6, does not match size of target_names, 9\n",
      "  .format(len(labels), len(target_names))\n",
      "/usr/local/lib/python3.5/dist-packages/sklearn/metrics/classification.py:1135: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.\n",
      "  'precision', 'predicted', average, warn_for)\n"
     ]
    }
   ],
   "source": [
    "from sklearn.metrics import classification_report\n",
    "print(classification_report(np.vstack(label_Y).ravel(), np.vstack(predicted_Y).ravel(), target_names=tag2idx.keys()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
